The data come from the World Bank. After downloading, unzip the archive into “your working directory/data” and import “WDIData.csv”.

- First, we will load the .csv (nearly 200 MB, so it will take a while).

library(readr)
library(dplyr)
# Read the World Development Indicators table from the local data folder.
dat <- read_csv(file = "data/WDIData.csv")

Data

- Check the imported data

# Show the first rows of the imported table.
print(dat)

Data (different view)

- Check the data structure (via dplyr’s glimpse() function).

# Column-wise overview: one line per variable with its type and first values.
dat %>%
  glimpse()
## Observations: 409,992
## Variables: 63
## $ `Country Name`   <chr> "Arab World", "Arab World", "Arab World", "Ar...
## $ `Country Code`   <chr> "ARB", "ARB", "ARB", "ARB", "ARB", "ARB", "AR...
## $ `Indicator Name` <chr> "2005 PPP conversion factor, GDP (LCU per int...
## $ `Indicator Code` <chr> "PA.NUS.PPP.05", "PA.NUS.PRVT.PP.05", "EG.CFT...
## $ `1960`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1961`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1962`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1963`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1964`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1965`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1966`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1967`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1968`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1969`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1970`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1971`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1972`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1973`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1974`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1975`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1976`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1977`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1978`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1979`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1980`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1981`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1982`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1983`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1984`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1985`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1986`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1987`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1988`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1989`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `1990`           <dbl> NA, NA, NA, 76.14870, 59.35809, NA, NA, NA, N...
## $ `1991`           <dbl> NA, NA, NA, 76.56355, 60.17835, NA, NA, NA, N...
## $ `1992`           <dbl> NA, NA, NA, 74.06687, 61.29522, NA, NA, NA, N...
## $ `1993`           <dbl> NA, NA, NA, 74.84607, 62.10662, NA, NA, NA, N...
## $ `1994`           <dbl> NA, NA, NA, 75.30393, 62.96733, NA, NA, NA, N...
## $ `1995`           <dbl> NA, NA, NA, 76.04233, 63.72973, NA, NA, NA, N...
## $ `1996`           <dbl> NA, NA, NA, 76.76857, 64.70132, NA, NA, NA, N...
## $ `1997`           <dbl> NA, NA, NA, 77.14708, 64.95271, NA, NA, NA, N...
## $ `1998`           <dbl> NA, NA, NA, 77.98579, 66.38775, NA, NA, NA, N...
## $ `1999`           <dbl> NA, NA, NA, 78.55301, 64.08934, NA, NA, NA, N...
## $ `2000`           <dbl> NA, NA, 76.59969, 79.23617, 65.18909, NA, NA,...
## $ `2001`           <dbl> NA, NA, 77.44310, 79.76829, 65.52742, NA, NA,...
## $ `2002`           <dbl> NA, NA, 78.24495, 80.22959, 66.22084, NA, NA,...
## $ `2003`           <dbl> NA, NA, 79.01927, 80.96835, 67.48075, NA, NA,...
## $ `2004`           <dbl> NA, NA, 79.79252, 82.62879, 67.51666, 93.0769...
## $ `2005`           <dbl> NA, NA, 80.57069, 83.35198, 70.09179, 93.0009...
## $ `2006`           <dbl> NA, NA, 81.34937, 83.78790, 70.26531, 93.2334...
## $ `2007`           <dbl> NA, NA, 82.11228, 84.39166, 71.23760, 93.4736...
## $ `2008`           <dbl> NA, NA, 82.83193, 85.04225, 72.29232, 95.0940...
## $ `2009`           <dbl> NA, NA, 83.47462, 84.65534, 71.76534, 94.7233...
## $ `2010`           <dbl> NA, NA, 84.00608, 85.95535, 73.73427, 95.2998...
## $ `2011`           <dbl> NA, NA, 84.41615, 86.39231, 74.50072, 95.3942...
## $ `2012`           <dbl> NA, NA, 84.73457, 86.84697, 75.21850, 95.4750...
## $ `2013`           <dbl> NA, NA, 85.00364, 87.60496, 76.71225, 95.6688...
## $ `2014`           <dbl> NA, NA, 85.24497, 88.03912, 77.40727, 96.0886...
## $ `2015`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `2016`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ `2017`           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X63              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...

Let’s filter & tidy this mess

- Select one indicator (GDP growth from the 1960s until 2016) for the UK, the USA and the EU, and make checking data via glimpse() great again!

library(tidyr)
# Keep only the annual GDP growth indicator for the three economies of
# interest.
# `%in%` tests every row against the whole set of names. Comparing with `==`
# against a length-3 vector recycles that vector down the rows, so a country
# is kept only when its row position happens to line up with its slot in the
# vector.
dc <- dat %>%
  filter(
    `Indicator Name` == "GDP growth (annual %)",
    `Country Name` %in% c("United Kingdom", "United States", "European Union")
  )
glimpse(dc)
## Observations: 3
## Variables: 63
## $ `Country Name`   <chr> "European Union", "United Kingdom", "United S...
## $ `Country Code`   <chr> "EUU", "GBR", "USA"
## $ `Indicator Name` <chr> "GDP growth (annual %)", "GDP growth (annual ...
## $ `Indicator Code` <chr> "NY.GDP.MKTP.KD.ZG", "NY.GDP.MKTP.KD.ZG", "NY...
## $ `1960`           <dbl> NA, NA, NA
## $ `1961`           <dbl> 5.59226, 2.57357, 2.30000
## $ `1962`           <dbl> 5.031483, 1.327993, 6.100000
## $ `1963`           <dbl> 5.147830, 3.956241, 4.400000
## $ `1964`           <dbl> 5.567709, 5.038282, 5.800000
## $ `1965`           <dbl> 4.393252, 2.788977, 6.400000
## $ `1966`           <dbl> 4.298803, 2.049311, 6.500000
## $ `1967`           <dbl> 4.456442, 2.310965, 2.500000
## $ `1968`           <dbl> 5.064403, 3.984738, 4.800000
## $ `1969`           <dbl> 5.787202, 2.052446, 3.100000
## $ `1970`           <dbl> 5.608831, 5.957661, 3.206807
## $ `1971`           <dbl> 3.639819, 3.479335, 3.295477
## $ `1972`           <dbl> 4.714796, 4.294470, 5.263263
## $ `1973`           <dbl> 6.105374, 6.516075, 5.643125
## $ `1974`           <dbl> 2.2240886, -2.4726740, -0.5171546
## $ `1975`           <dbl> -0.7789266, -1.4883432, -0.1976785
## $ `1976`           <dbl> 4.598848, 2.921780, 5.386090
## $ `1977`           <dbl> 2.813866, 2.463214, 4.608597
## $ `1978`           <dbl> 3.224537, 4.195285, 5.561685
## $ `1979`           <dbl> 3.830662, 3.735497, 3.175691
## $ `1980`           <dbl> 1.4696051, -2.0411644, -0.2445962
## $ `1981`           <dbl> 0.3204955, -0.7789262, 2.5944704
## $ `1982`           <dbl> 0.9862215, 2.0156720, -1.9108911
## $ `1983`           <dbl> 1.834572, 4.220573, 4.632457
## $ `1984`           <dbl> 2.453812, 2.274567, 7.259087
## $ `1985`           <dbl> 2.614133, 4.187306, 4.238738
## $ `1986`           <dbl> 2.670649, 3.153252, 3.511614
## $ `1987`           <dbl> 2.944893, 5.359485, 3.461748
## $ `1988`           <dbl> 4.409114, 5.787400, 4.203972
## $ `1989`           <dbl> 3.725087, 2.582895, 3.680524
## $ `1990`           <dbl> 2.971492, 0.716883, 1.919370
## $ `1991`           <dbl> 1.42471574, -1.11898405, -0.07408453
## $ `1992`           <dbl> 1.0580811, 0.3598724, 3.5553961
## $ `1993`           <dbl> -0.1512518, 2.5070137, 2.7458567
## $ `1994`           <dbl> 2.836740, 3.885075, 4.037643
## $ `1995`           <dbl> 2.686671, 2.506373, 2.718976
## $ `1996`           <dbl> 1.996811, 2.548734, 3.795881
## $ `1997`           <dbl> 2.772792, 3.127178, 4.487026
## $ `1998`           <dbl> 2.977477, 3.190779, 4.449911
## $ `1999`           <dbl> 3.032757, 3.283348, 4.685200
## $ `2000`           <dbl> 3.881161, 3.744962, 4.092176
## $ `2001`           <dbl> 2.2360380, 2.7261073, 0.9759818
## $ `2002`           <dbl> 1.336565, 2.397248, 1.786128
## $ `2003`           <dbl> 1.334203, 3.466239, 2.806776
## $ `2004`           <dbl> 2.594107, 2.527877, 3.785743
## $ `2005`           <dbl> 2.084431, 2.972096, 3.345216
## $ `2006`           <dbl> 3.357783, 2.503009, 2.666626
## $ `2007`           <dbl> 3.086410, 2.555819, 1.778570
## $ `2008`           <dbl> 0.4581657, -0.6272052, -0.2916215
## $ `2009`           <dbl> -4.383413, -4.327738, -2.775530
## $ `2010`           <dbl> 2.153152, 1.915162, 2.531921
## $ `2011`           <dbl> 1.665908, 1.509062, 1.601455
## $ `2012`           <dbl> -0.4718219, 1.3130186, 2.2240309
## $ `2013`           <dbl> 0.2223498, 1.9110784, 1.6773315
## $ `2014`           <dbl> 1.669613, 3.070484, 2.370458
## $ `2015`           <dbl> 2.202898, 2.194229, 2.596148
## $ `2016`           <dbl> 1.873942, 1.806018, 1.615656
## $ `2017`           <dbl> NA, NA, NA
## $ X63              <chr> NA, NA, NA

Still not tidy enough!

- Let’s tidy some more: select only what we need, transform the dataset from wide to long, and drop the NAs.

- & Take a Glimpse again.

# List the column names with their positions, to pick the columns to keep.
colnames(dc)
##  [1] "Country Name"   "Country Code"   "Indicator Name" "Indicator Code"
##  [5] "1960"           "1961"           "1962"           "1963"          
##  [9] "1964"           "1965"           "1966"           "1967"          
## [13] "1968"           "1969"           "1970"           "1971"          
## [17] "1972"           "1973"           "1974"           "1975"          
## [21] "1976"           "1977"           "1978"           "1979"          
## [25] "1980"           "1981"           "1982"           "1983"          
## [29] "1984"           "1985"           "1986"           "1987"          
## [33] "1988"           "1989"           "1990"           "1991"          
## [37] "1992"           "1993"           "1994"           "1995"          
## [41] "1996"           "1997"           "1998"           "1999"          
## [45] "2000"           "2001"           "2002"           "2003"          
## [49] "2004"           "2005"           "2006"           "2007"          
## [53] "2008"           "2009"           "2010"           "2011"          
## [57] "2012"           "2013"           "2014"           "2015"          
## [61] "2016"           "2017"           "X63"
# Keep the country, the indicator label and columns 5 to 61
# (the years 1960-2016 in the names(dc) listing).
dc1 <- dc %>%
  select("Country Name", "Indicator Name", 5:61)
# Stack every column except the country name into Year / value pairs and drop
# missing values. The "Indicator Name" column is stacked as well, which is why
# both new columns come back as character.
dc2 <- dc1 %>%
  gather(
    key = "Year",
    value = "GDP growth (annual %)",
    -"Country Name",
    na.rm = TRUE
  )
glimpse(dc2)
## Observations: 171
## Variables: 3
## $ `Country Name`          <chr> "European Union", "United Kingdom", "U...
## $ Year                    <chr> "Indicator Name", "Indicator Name", "I...
## $ `GDP growth (annual %)` <chr> "GDP growth (annual %)", "GDP growth (...

Still not a tidy dataset!

- Let’s remove the first 3 rows (they hold the indicator label, not data) and convert both columns to numeric.

- & Glimpse again.

dc2
# Drop the rows that came from stacking the "Indicator Name" column: their
# Year holds that column label instead of a year. Selecting them by value
# rather than by row positions 1:3 keeps the step correct if the row order or
# the number of countries changes, and makes it safe to re-run (a second run
# removes nothing instead of deleting three real observations).
dc2 <- dc2[dc2$Year != "Indicator Name", ]
dc2
# Both columns are still character after gather(); convert them to numbers.
dc2$Year <- as.numeric(dc2$Year)
dc2$`GDP growth (annual %)` <- as.numeric(dc2$`GDP growth (annual %)`)
class(dc2$Year)
## [1] "numeric"
class(dc2$`GDP growth (annual %)`)
## [1] "numeric"
glimpse(dc2)
## Observations: 168
## Variables: 3
## $ `Country Name`          <chr> "European Union", "United Kingdom", "U...
## $ Year                    <dbl> 1961, 1961, 1961, 1962, 1962, 1962, 19...
## $ `GDP growth (annual %)` <dbl> 5.592260, 2.573570, 2.300000, 5.031483...

Tidy enough & We can start plotting now!

- Let’s import rbokeh and make it a line plot by a country.

- The pipe (%>%) will make your figure “smoking hot” (you can chain the customizations with it too). We also set the size, remove the Bokeh logo and add a title.

library(rbokeh)
# Basic line chart of GDP growth against year, coloured by country.
p <- figure(
  width = 600,
  height = 350,
  legend_location = "top_right",
  title = "GDP Growth (%) USA, EU, UK",
  logo = NULL
) %>%
  ly_lines(
    x = "Year",
    y = "GDP growth (annual %)",
    data = dc2,
    alpha = 0.5,
    width = 5,
    col = "Country Name"
  )
p

Congratulations, your first plot! But still not publication quality… let’s customize some more…

- We will add points too.

- Make them black and transparent (using alpha transparency).

library(rbokeh)
# Same line chart, with semi-transparent black points on each observation.
q <- figure(
  width = 600,
  height = 350,
  legend_location = "top_right",
  title = "GDP Growth (%) USA, EU, UK",
  logo = NULL
) %>%
  ly_lines(
    x = "Year",
    y = "GDP growth (annual %)",
    data = dc2,
    alpha = 0.5,
    width = 5,
    col = "Country Name"
  ) %>%
  ly_points(
    x = "Year",
    y = "GDP growth (annual %)",
    data = dc2,
    alpha = 0.5,
    size = 4,
    col = "black"
  )
q

Not bad! But still not publication quality… what about the legend? (It is covering our points.)

- Let’s make it transparent too (pipe it via theme_legend).

- Let’s change the default font too (everybody ♥ Garamond, right?).

library(rbokeh)
# Lines and points as before, plus a styled legend: nearly transparent
# background and bold 8pt Garamond labels.
r <- figure(
  width = 600,
  height = 350,
  legend_location = "top_right",
  title = "GDP Growth (%) USA, EU, UK",
  logo = NULL
) %>%
  ly_lines(
    x = "Year",
    y = "GDP growth (annual %)",
    data = dc2,
    alpha = 0.5,
    width = 5,
    col = "Country Name"
  ) %>%
  ly_points(
    x = "Year",
    y = "GDP growth (annual %)",
    data = dc2,
    alpha = 0.5,
    size = 4,
    col = "black"
  ) %>%
  theme_legend(
    border_line_width = 1,
    background_fill_alpha = 0.1,
    label_text_font_size = "8pt",
    label_text_align = "left",
    label_text_font = "Garamond",
    label_text_font_style = "bold"
  )
r

Getting better! But still not publication quality… what about the title?

- Let’s make it bigger and Garamond too (pipe it via theme_title).

library(rbokeh)
# Lines, points and legend styling as before, plus a centred 14pt Garamond
# title.
s <- figure(
  width = 600,
  height = 350,
  legend_location = "top_right",
  title = "GDP Growth (%) USA, EU, UK",
  logo = NULL
) %>%
  ly_lines(
    x = "Year",
    y = "GDP growth (annual %)",
    data = dc2,
    alpha = 0.5,
    width = 5,
    col = "Country Name"
  ) %>%
  ly_points(
    x = "Year",
    y = "GDP growth (annual %)",
    data = dc2,
    alpha = 0.5,
    size = 4,
    col = "black"
  ) %>%
  theme_legend(
    border_line_width = 1,
    background_fill_alpha = 0.1,
    label_text_font_size = "8pt",
    label_text_align = "left",
    label_text_font = "Garamond",
    label_text_font_style = "bold"
  ) %>%
  theme_title(
    text_align = "center",
    text_font = "Garamond",
    text_font_size = "14pt",
    text_baseline = "bottom"
  )
s

Definitely better! But still not publication quality… what about the axes?

- Let’s make the axis labels bold and Garamond too (pipe it via theme_axis).

- Let’s change the major_label font too (everybody still ♥ Garamond, right?).

library(rbokeh)
# Lines, points, legend and title styling as before, plus bold Garamond axis
# labels (12pt) and tick labels (10pt).
t <- figure(
  width = 600,
  height = 350,
  legend_location = "top_right",
  title = "GDP Growth (%) USA, EU, UK",
  logo = NULL
) %>%
  ly_lines(
    x = "Year",
    y = "GDP growth (annual %)",
    data = dc2,
    alpha = 0.5,
    width = 5,
    col = "Country Name"
  ) %>%
  ly_points(
    x = "Year",
    y = "GDP growth (annual %)",
    data = dc2,
    alpha = 0.5,
    size = 4,
    col = "black"
  ) %>%
  theme_legend(
    border_line_width = 1,
    background_fill_alpha = 0.1,
    label_text_font_size = "8pt",
    label_text_align = "left",
    label_text_font = "Garamond",
    label_text_font_style = "bold"
  ) %>%
  theme_title(
    text_align = "center",
    text_font = "Garamond",
    text_font_size = "14pt",
    text_baseline = "bottom"
  ) %>%
  theme_axis(
    axis_label_text_font = "Garamond",
    axis_label_text_font_size = "12pt",
    axis_label_text_font_style = "bold",
    major_label_text_font = "Garamond",
    major_label_text_font_size = "10pt",
    major_label_text_font_style = "bold"
  )
t

Nearly there! But still not publication quality… it is growth, right? How about making positive and negative growth visually distinct?

- rbokeh allows us to do this via ly_abline, so let’s pipe it in (intercept a = 0 and slope b = 0 give a horizontal line at zero).

library(rbokeh)
library(htmlwidgets)
# Final figure: everything from the previous steps, an explicit toolbar, a
# thin black reference line at zero growth and a lasso-select tool.
u <- figure(
  width = 600,
  height = 350,
  legend_location = "top_right",
  title = "GDP Growth (%) USA, EU, UK",
  logo = NULL,
  tools = c("pan", "wheel_zoom", "box_zoom", "box_select", "reset", "resize")
) %>%
  ly_lines(
    x = "Year",
    y = "GDP growth (annual %)",
    data = dc2,
    alpha = 0.5,
    width = 5,
    col = "Country Name"
  ) %>%
  ly_points(
    x = "Year",
    y = "GDP growth (annual %)",
    data = dc2,
    alpha = 0.5,
    size = 4,
    col = "black"
  ) %>%
  theme_legend(
    border_line_width = 1,
    background_fill_alpha = 0.1,
    label_text_font_size = "8pt",
    label_text_align = "left",
    label_text_font = "Garamond",
    label_text_font_style = "bold"
  ) %>%
  theme_title(
    text_align = "center",
    text_font = "Garamond",
    text_font_size = "14pt",
    text_baseline = "bottom"
  ) %>%
  theme_axis(
    axis_label_text_font = "Garamond",
    axis_label_text_font_size = "12pt",
    axis_label_text_font_style = "bold",
    major_label_text_font = "Garamond",
    major_label_text_font_size = "10pt",
    major_label_text_font_style = "bold"
  ) %>%
  # a = 0 and b = 0 describe the line y = 0, separating positive from
  # negative growth.
  ly_abline(
    a = 0,
    b = 0,
    v = NULL,
    h = NULL,
    coef = NULL,
    color = "black",
    width = 1,
    type = 1,
    legend = NULL,
    visible = TRUE
  ) %>%
  tool_lasso_select()
u
# Write the interactive figure to an HTML file.
saveWidget(u, file = "rbokeh001.html")

Congratulations! You now have a publication-quality figure ready, and it’s fully interactive!